import pandas as pd


df = pd.read_csv("QueryResults.csv", names=["Date", "Tag", "Posts"], header=0)


df.head()


df.shape

(1991, 3)


df.count()

Date     1991
Tag      1991
Posts    1991
dtype: int64


df.groupby("Tag").sum()


df.groupby("Tag").count()


df.Date[0]

'2008-07-01 00:00:00'


type(df["Date"][0])

str


df["Date"] = pd.to_datetime(df["Date"])


test_df = pd.DataFrame({'Age': ['Young', 'Young', 'Young', 'Young', 'Old', 'Old', 'Old'],
                        'Actor': ['Jack', 'Arnold', 'Keanu', 'Sylvester', 'Jack', 'Arnold', 'Keanu'],
                        'Power': [100, 80, 25, 50, 99, 75, 5]})
pivoted_df = test_df.pivot(index="Age", columns="Actor", values="Power")
pivoted_df
# test_df


reshaped_df = df.pivot(index="Date", columns="Tag", values="Posts")
reshaped_df.head()


reshaped_df.shape

(145, 14)


reshaped_df.columns

Index(['assembly', 'c', 'c#', 'c++', 'delphi', 'go', 'java', 'javascript',
       'perl', 'php', 'python', 'r', 'ruby', 'swift'],
      dtype='object', name='Tag')


reshaped_df.count()

Tag
assembly      144
c             144
c#            145
c++           144
delphi        144
go            129
java          144
javascript    144
perl          144
php           144
python        144
r             142
ruby          144
swift         135
dtype: int64


reshaped_df.fillna(0, inplace=True)


reshaped_df.isna().values.any()

False


import matplotlib.pyplot as plt


plt.figure(figsize=(16,10))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.plot(reshaped_df.index, reshaped_df["java"])

[<matplotlib.lines.Line2D at 0x7fb2520963d0>]


plt.figure(figsize=(16,10))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Date", fontsize=14)
plt.ylabel("Number of Posts", fontsize=14)
plt.ylim(0, 25000)
plt.plot(reshaped_df.index, reshaped_df["java"])

[<matplotlib.lines.Line2D at 0x7fb261598760>]


plt.figure(figsize=(16,10))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Date", fontsize=14)
plt.ylabel("Number of Posts", fontsize=14)
plt.ylim(0, 35000)

plt.plot(reshaped_df.index, reshaped_df["java"])
plt.plot(reshaped_df.index, reshaped_df["python"])

[<matplotlib.lines.Line2D at 0x7fb240388160>]


plt.figure(figsize=(16,10))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Date", fontsize=14)
plt.ylabel("Number of Posts", fontsize=14)
plt.ylim(0, 35000)

for column in reshaped_df.columns:
    plt.plot(reshaped_df.index, reshaped_df[column], linewidth=3, label=reshaped_df[column].name)
    
plt.legend(fontsize=16)

<matplotlib.legend.Legend at 0x7fb220297970>


# The window is number of observations that are averaged
roll_df = reshaped_df.rolling(window=6).mean()

plt.figure(figsize=(16,10))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Number of Posts', fontsize=14)
plt.ylim(0, 35000)
 
# plot the roll_df instead
for column in roll_df.columns:
    plt.plot(roll_df.index, roll_df[column], 
             linewidth=3, label=roll_df[column].name)
 
plt.legend(fontsize=16)

<matplotlib.legend.Legend at 0x7fb2403eebb0>


import datetime as dt
# idxmax() returns Series object 
answer = df[(df['Date'] >= dt.datetime(2008,1,1)) & 
            (df['Date'] <= dt.datetime(2012,12,31))].groupby('Tag').sum().loc[
                ["c#", "javascript", "python", "assembly"]].idxmax()["Posts"]
answer

'c#'


option_list = ["python", "java", "javascript", "php"]
answer_2 = df[(df["Date"] >= dt.datetime(2015, 1, 1)) & 
              (df['Date'] <= dt.datetime(2018, 12, 31))].groupby("Tag").sum().loc[option_list].idxmax()["Posts"]
answer_2

'javascript'

Tag	assembly	c	c#	c++	delphi	go	java	javascript	perl	php	python	r	ruby	swift
Date
2008-07-01	NaN	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2008-08-01	8.0	85.0	511.0	164.0	14.0	NaN	222.0	162.0	28.0	161.0	124.0	NaN	73.0	NaN
2008-09-01	28.0	321.0	1649.0	755.0	105.0	NaN	1137.0	640.0	131.0	482.0	542.0	6.0	290.0	NaN
2008-10-01	15.0	303.0	1989.0	811.0	112.0	NaN	1153.0	725.0	127.0	617.0	510.0	NaN	249.0	NaN
2008-11-01	17.0	259.0	1730.0	735.0	141.0	NaN	958.0	579.0	97.0	504.0	452.0	1.0	160.0	NaN

Fame of Programming Languages¶

Get the Data¶

Import Statements¶

Data Exploration¶

Data Cleaning¶

Data Manipulation¶

Data Visualisaton with with Matplotlib¶

Smoothing out Time Series Data¶

	Date	Tag	Posts
0	2008-07-01 00:00:00	c#	3
1	2008-08-01 00:00:00	assembly	8
2	2008-08-01 00:00:00	javascript	162
3	2008-08-01 00:00:00	c	85
4	2008-08-01 00:00:00	python	124

	Posts
Tag
assembly	34852
c	336042
c#	1423530
c++	684210
delphi	46212
go	47499
java	1696403
javascript	2056510
perl	65286
php	1361988
python	1496210
r	356799
ruby	214582
swift	273055

Actor	Arnold	Jack	Keanu	Sylvester
Age
Old	75.0	99.0	5.0	NaN
Young	80.0	100.0	25.0	50.0